# Necessary libraries
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
## 
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(plotly)
## Warning: package 'plotly' was built under R version 4.4.3
## 
## Adjuntando el paquete: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(GGally)
## Warning: package 'GGally' was built under R version 4.4.3
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.4.3
## 
## Adjuntando el paquete: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
library(Rtsne)
## Warning: package 'Rtsne' was built under R version 4.4.3
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Cargando paquete requerido: lattice
library(dbscan)
## Warning: package 'dbscan' was built under R version 4.4.3
## 
## Adjuntando el paquete: 'dbscan'
## The following object is masked from 'package:stats':
## 
##     as.dendrogram
library(cluster)
## Warning: package 'cluster' was built under R version 4.4.3
library(dendextend)
## Warning: package 'dendextend' was built under R version 4.4.3
## 
## ---------------------
## Welcome to dendextend version 1.19.0
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags: 
##   https://stackoverflow.com/questions/tagged/dendextend
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------
## 
## Adjuntando el paquete: 'dendextend'
## The following object is masked from 'package:stats':
## 
##     cutree
library(mclust)
## Warning: package 'mclust' was built under R version 4.4.3
## Package 'mclust' version 6.1.1
## Type 'citation("mclust")' for citing this R package in publications.
library(pheatmap)
## Warning: package 'pheatmap' was built under R version 4.4.3
library(aricode)
## Warning: package 'aricode' was built under R version 4.4.3

Exploratory Data Analysis

Data Set Information

Basketball Reference’s WNBA per-100-possession data for the seasons 1997 to 2024

Stat Description
Player Player Name
Team Team
Pos Position
G Games
MP Minutes Played
GS Games Started
FG Field Goals (includes both 2-point and 3-point field goals)
FGA Field Goal Attempts (includes both 2-point and 3-point attempts)
FG% Field Goal Percentage; formula: FG / FGA.
3P 3-Point Field Goals
3P% 3-Point Field Goal Percentage; the formula is 3P / 3PA.
3PA 3-Point Field Goal Attempts
2P 2-Point Field Goals
2PA 2-Point Field Goal Attempts
2P% 2-Point Field Goal Percentage; the formula is 2P / 2PA.
FT Free Throws
FT% Free Throw Percentage; formula: FT / FTA.
FTA Free Throw Attempts
ORB Offensive Rebounds
TRB Total Rebounds
AST Assists
STL Steals
BLK Blocks
TOV Turnovers
PF Personal Fouls
PTS Points
Season Season Start Year
script_dir <- getwd()
data_directory <- file.path(script_dir, 'data')
file_path <- file.path(data_directory, 'per_100.csv')
data <- read.csv(file_path)

data_shape <- dim(data)
print(paste("Number of rows:", data_shape[1]))
## [1] "Number of rows: 4935"
print(paste("Number of columns:", data_shape[2]))
## [1] "Number of columns: 29"
columns <- colnames(data)
print("Columns:")
## [1] "Columns:"
print(columns)
##  [1] "Player" "Team"   "Pos"    "G"      "MP"     "G.1"    "GS"     "MP.1"  
##  [9] "FG"     "FGA"    "FG."    "X3P"    "X3PA"   "X3P."   "X2P"    "X2PA"  
## [17] "X2P."   "FT"     "FTA"    "FT."    "ORB"    "TRB"    "AST"    "STL"   
## [25] "BLK"    "TOV"    "PF"     "PTS"    "Season"
unique_players <- length(unique(data$Player))
print(paste("Number of unique players:", unique_players))
## [1] "Number of unique players: 1095"
positions <- unique(data$Pos)
players_per_position <- sapply(positions, function(pos) sum(data$Pos == pos))
print("Players per Position:")
## [1] "Players per Position:"
print(players_per_position)
##    F    G  G-F  F-C    C  F-G  C-F 
## 1475 2009  230  312  686  110  113

Data Pre-Processing

data$Pos <- gsub("G-F", "F-G", data$Pos)
data$Pos <- gsub("C-F", "F-C", data$Pos)
# data$Pos <- substr(data$Pos, 1, 1)

players_per_position <- sapply(positions, function(pos) sum(data$Pos == pos))
print("Players per Position:")
## [1] "Players per Position:"
print(players_per_position)
##    F    G  G-F  F-C    C  F-G  C-F 
## 1475 2009    0  425  686  340    0
similarity_g <- sum(data[, "G"] == data[, "G.1"])/length(data[, "G"])
similarity_mp <- sum(data[, "MP"] == data[, "MP.1"])/length(data[, "MP"])

if (similarity_g > 0.9) {
  print(paste("Columns G and G.1 are", round(similarity_g * 100, 2), "% identical"))
  data <- data[, -which(colnames(data) == "G.1")]
}
## [1] "Columns G and G.1 are 100 % identical"
if (similarity_mp > 0.9) {
  print(paste("Columns MP and MP.1 are", round(similarity_mp * 100, 2), "% identical"))
  data <- data[, -which(colnames(data) == "MP.1")]
}
## [1] "Columns MP and MP.1 are 100 % identical"
columns <- colnames(data)
print("New Columns:")
## [1] "New Columns:"
print(columns)
##  [1] "Player" "Team"   "Pos"    "G"      "MP"     "GS"     "FG"     "FGA"   
##  [9] "FG."    "X3P"    "X3PA"   "X3P."   "X2P"    "X2PA"   "X2P."   "FT"    
## [17] "FTA"    "FT."    "ORB"    "TRB"    "AST"    "STL"    "BLK"    "TOV"   
## [25] "PF"     "PTS"    "Season"
column_types <- sapply(data, class)
print("Column Types:")
## [1] "Column Types:"
print(column_types)
##      Player        Team         Pos           G          MP          GS 
## "character" "character" "character"   "integer"   "integer"   "integer" 
##          FG         FGA         FG.         X3P        X3PA        X3P. 
##   "numeric"   "numeric"   "numeric"   "numeric"   "numeric"   "numeric" 
##         X2P        X2PA        X2P.          FT         FTA         FT. 
##   "numeric"   "numeric"   "numeric"   "numeric"   "numeric"   "numeric" 
##         ORB         TRB         AST         STL         BLK         TOV 
##   "numeric"   "numeric"   "numeric"   "numeric"   "numeric"   "numeric" 
##          PF         PTS      Season 
##   "numeric"   "numeric"   "integer"
data$Season <- as.character(data$Season)

missing_values <- sum(is.na(data))
print(paste(" Total Missing Values:", missing_values))
## [1] " Total Missing Values: 1513"
missing_values_by_column <- colSums(is.na(data))
print("Missing Values by Column:")
## [1] "Missing Values by Column:"
print(missing_values_by_column)
## Player   Team    Pos      G     MP     GS     FG    FGA    FG.    X3P   X3PA 
##      0      0      0      0      0      0      8      8     63      8      8 
##   X3P.    X2P   X2PA   X2P.     FT    FTA    FT.    ORB    TRB    AST    STL 
##    888      8      8     91      8      8    343      8      8      8      8 
##    BLK    TOV     PF    PTS Season 
##      8      8      8      8      0
data_na <- data[is.na(data$FG),]
print("Rows with NA in FG column:")
## [1] "Rows with NA in FG column:"
print(data_na)
##                Player Team Pos G MP GS FG FGA FG. X3P X3PA X3P. X2P X2PA X2P.
## 2127    Wanisha Smith  DET   G 1  0  0 NA  NA  NA  NA   NA   NA  NA   NA   NA
## 2150 Brittany Wilkins  SAS   C 1  0  0 NA  NA  NA  NA   NA   NA  NA   NA   NA
## 2181    Crystal Smith  MIN   G 1  0  0 NA  NA  NA  NA   NA   NA  NA   NA   NA
## 3345   Ameryst Alston  NYL   G 1  0  0 NA  NA  NA  NA   NA   NA  NA   NA   NA
## 3949 Angel McCoughtry  ATL F-G 1  0  1 NA  NA  NA  NA   NA   NA  NA   NA   NA
## 4054      Emma Cannon  LVA   F 1  0  0 NA  NA  NA  NA   NA   NA  NA   NA   NA
## 4276 Angel McCoughtry  LVA F-G 1  0  0 NA  NA   0  NA   NA   NA  NA   NA    0
## 4610       Lauren Cox  CON   F 1  0  0 NA  NA  NA  NA   NA   NA  NA   NA   NA
##      FT FTA FT. ORB TRB AST STL BLK TOV PF PTS Season
## 2127 NA  NA  NA  NA  NA  NA  NA  NA  NA NA  NA   2008
## 2150 NA  NA  NA  NA  NA  NA  NA  NA  NA NA  NA   2008
## 2181 NA  NA  NA  NA  NA  NA  NA  NA  NA NA  NA   2008
## 3345 NA  NA  NA  NA  NA  NA  NA  NA  NA NA  NA   2016
## 3949 NA  NA  NA  NA  NA  NA  NA  NA  NA NA  NA   2019
## 4054 NA  NA  NA  NA  NA  NA  NA  NA  NA NA  NA   2020
## 4276 NA  NA  NA  NA  NA  NA  NA  NA  NA NA  NA   2021
## 4610 NA  NA  NA  NA  NA  NA  NA  NA  NA NA  NA   2023

We can confidently drop these rows, as it seems it’s simply players that did not play that year. Possibly due to injury or other reasons.

data <- data[!data$MP == 0,]

data_na <- data[is.na(data$FG.),]
print("Rows with NA in FG. column:")
## [1] "Rows with NA in FG. column:"
print(data_na)
##                        Player Team Pos G MP GS FG FGA FG. X3P X3PA X3P. X2P
## 68            Ryneldi Becenti  PHO   G 1  8  0  0   0  NA   0    0   NA   0
## 88                 Kim Gessig  LAS   F 1  4  0  0   0  NA   0    0   NA   0
## 299       Kellie Jolly Harper  CLE   G 1  4  0  0   0  NA   0    0   NA   0
## 304    MerleLynn Lange-Harris  PHO F-C 1  3  0  0   0  NA   0    0   NA   0
## 308             Rebecca Lobo*  NYL   C 1  1  1  0   0  NA   0    0   NA   0
## 587          Beverly Williams  IND   G 1  3  0  0   0  NA   0    0   NA   0
## 602         Kristen Rasmussen  UTA   F 1  9  0  0   0  NA   0    0   NA   0
## 603             Angela Aycock  MIN   G 3  6  0  0   0  NA   0    0   NA   0
## 610        Mactabene Amachree  NYL   F 2  3  0  0   0  NA   0    0   NA   0
## 648          Keitha Dickerson  UTA   F 4  6  0  0   0  NA   0    0   NA   0
## 787              Levys Torres  MIA   C 2  8  0  0   0  NA   0    0   NA   0
## 810                Dana Wynne  SAC   F 1  3  0  0   0  NA   0    0   NA   0
## 820            Monique Ambers  SAC   F 2  4  0  0   0  NA   0    0   NA   0
## 947             Shantia Owens  CHA   F 2  6  0  0   0  NA   0    0   NA   0
## 972           Elena Shakirova  CHA   C 1  5  0  0   0  NA   0    0   NA   0
## 1038          Katryna Gaither  WAS   C 1  1  0  0   0  NA   0    0   NA   0
## 1092         Bethany Donaphin  NYL F-C 1  2  0  0   0  NA   0    0   NA   0
## 1204       Itoro Umoh-Coleman  HOU   G 3  6  0  0   0  NA   0    0   NA   0
## 1259            Amisha Carter  DET   F 2 13  0  0   0  NA   0    0   NA   0
## 1284      Shaunzinski Gortman  WAS   G 4 24  0  0   0  NA   0    0   NA   0
## 1375            Stacey Thomas  DET   F 1  1  0  0   0  NA   0    0   NA   0
## 1403          Jae Kingi-Cross  DET   G 5 15  0  0   0  NA   0    0   NA   0
## 1590             Kiesha Brown  HOU   G 4 18  0  0   0  NA   0    0   NA   0
## 1621           Jessica Brungo  CON   F 4  7  0  0   0  NA   0    0   NA   0
## 1714            Irina Osipova  DET   C 2 12  0  0   0  NA   0    0   NA   0
## 1785        Ambrosia Anderson  CON   F 1  1  0  0   0  NA   0    0   NA   0
## 1814            Cori Chambers  CON   G 1  1  0  0   0  NA   0    0   NA   0
## 1878             Teana Miller  PHO   C 2  5  0  0   0  NA   0    0   NA   0
## 1924                Kim Smith  SAC   F 3  6  0  0   0  NA   0    0   NA   0
## 2318         Kelly Schumacher  DET   C 1 10  0  0   0  NA   0    0   NA   0
## 3004        Samantha Prahalis  NYL   G 3  8  0  0   0  NA   0    0   NA   0
## 3047          Aaryn Ellenberg  CHI   G 2  1  0  0   0  NA   0    0   NA   0
## 3500           Blake Dietrick  SEA   G 2  6  0  0   0  NA   0    0   NA   0
## 3677          Chelsea Hopkins  NYL   G 1  1  0  0   0  NA   0    0   NA   0
## 3683           Sophie Brunner  SAS   F 1  2  0  0   0  NA   0    0   NA   0
## 3733             Adaora Elonu  ATL   F 1  1  0  0   0  NA   0    0   NA   0
## 3745             Amber Harris  CHI   F 1  2  0  0   0  NA   0    0   NA   0
## 3862            Teana Muldrow  DAL   F 1  3  0  0   0  NA   0    0   NA   0
## 3958              Jaime Nared  LVA F-G 1  1  0  0   0  NA   0    0   NA   0
## 4071              Kaela Davis  ATL   F 2  2  0  0   0  NA   0    0   NA   0
## 4183           Alisia Jenkins  IND   F 1  2  0  0   0  NA   0    0   NA   0
## 4186             Erica McCall  ATL   F 1  5  0  0   0  NA   0    0   NA   0
## 4191           Alisia Jenkins  CHI   F 2  3  0  0   0  NA   0    0   NA   0
## 4237            Aleah Goodman  CON   G 1  3  0  0   0  NA   0    0   NA   0
## 4246            Linnae Harper  MIN   G 1  5  0  0   0  NA   0    0   NA   0
## 4248  Mikiah Herbert Harrigan  SEA   F 1  1  0  0   0  NA   0    0   NA   0
## 4355        Layshia Clarendon  NYL   G 1  3  0  0   0  NA   0    0   NA   0
## 4360            Joyner Holmes  NYL   F 1  5  0  0   0  NA   0    0   NA   0
## 4364 Shatori Walker-Kimbrough  CON   G 1  4  0  0   0  NA   0    0   NA   0
## 4370             Natasha Mack  MIN   F 1  2  0  0   0  NA   0    0   NA   0
## 4405            Kaila Charles  ATL F-G 1  2  0  0   0  NA   0    0   NA   0
## 4477              Raina Perez  SEA   G 1  2  0  0   0  NA   0    0   NA   0
## 4554         Moriah Jefferson  DAL   G 1  4  0  0   0  NA   0    0   NA   0
## 4569           Kiana Williams  CON   G 1  3  0  0   0  NA   0    0   NA   0
## 4639         Khaalia Hillsman  CHI   C 1  1  0  0   0  NA   0    0   NA   0
## 4872             Taylor Soule  MIN   F 2  3  0  0   0  NA   0    0   NA   0
##      X2PA X2P.   FT  FTA  FT.  ORB   TRB  AST  STL  BLK   TOV   PF  PTS Season
## 68      0   NA  0.0  0.0   NA  0.0   0.0  0.0  6.7  0.0   6.7  0.0  0.0   1997
## 88      0   NA  0.0  0.0   NA  0.0  13.0  0.0  0.0  0.0  13.0 25.9  0.0   1997
## 299     0   NA  0.0  0.0   NA  0.0   0.0 14.1  0.0  0.0  28.2  0.0  0.0   1999
## 304     0   NA  0.0  0.0   NA  0.0  38.1  0.0  0.0  0.0   0.0 19.1  0.0   1999
## 308     0   NA  0.0  0.0   NA 58.5  58.5  0.0  0.0  0.0  58.5  0.0  0.0   1999
## 587     0   NA  0.0  0.0   NA  0.0   0.0  0.0  0.0  0.0   0.0 38.2  0.0   2000
## 602     0   NA  0.0  0.0   NA  6.1  12.1  6.1  6.1  0.0   6.1 12.1  0.0   2000
## 603     0   NA  0.0  0.0   NA  0.0  28.7 38.2  0.0  0.0   0.0  9.6  0.0   2000
## 610     0   NA 19.6 39.3 0.50 19.6  19.6  0.0  0.0 19.6  19.6 39.3 19.6   2001
## 648     0   NA  0.0 38.4 0.00  0.0   9.6  9.6  0.0  0.0   0.0  0.0  0.0   2001
## 787     0   NA  0.0  0.0   NA  0.0   0.0  0.0  0.0  0.0  15.5  0.0  0.0   2001
## 810     0   NA 38.0 38.0 1.00 19.0  19.0  0.0  0.0 19.0   0.0 38.0 38.0   2001
## 820     0   NA  0.0  0.0   NA  0.0   0.0  0.0  0.0  0.0   0.0 28.7  0.0   2002
## 947     0   NA  0.0  0.0   NA  0.0  20.3  0.0  0.0  0.0   0.0  0.0  0.0   2002
## 972     0   NA 12.2 24.4 0.50 12.2  12.2  0.0 12.2  0.0   0.0  0.0 12.2   2002
## 1038    0   NA  0.0  0.0   NA  0.0   0.0  0.0  0.0  0.0   0.0 59.1  0.0   2002
## 1092    0   NA  0.0  0.0   NA  0.0   0.0  0.0  0.0  0.0   0.0  0.0  0.0   2003
## 1204    0   NA  0.0  0.0   NA  0.0   0.0 10.0  0.0  0.0  10.0 10.0  0.0   2003
## 1259    0   NA  4.3  8.6 0.50  0.0  17.2  0.0  8.6  0.0  12.9 21.5  4.3   2004
## 1284    0   NA  0.0  0.0   NA  0.0   9.6  4.8  0.0  2.4   4.8  0.0  0.0   2004
## 1375    0   NA  0.0  0.0   NA  0.0   0.0  0.0  0.0  0.0   0.0  0.0  0.0   2004
## 1403    0   NA 11.2 14.9 0.75  0.0   3.7  3.7  0.0  0.0   7.4  3.7 11.2   2004
## 1590    0   NA  0.0  0.0   NA  0.0   3.3  6.6  3.3  0.0   6.6  3.3  0.0   2005
## 1621    0   NA  0.0  0.0   NA  0.0   0.0  0.0  0.0  0.0   7.5  0.0  0.0   2006
## 1714    0   NA  8.8  8.8 1.00  4.4   4.4  0.0  0.0  0.0   4.4  4.4  8.8   2006
## 1785    0   NA  0.0  0.0   NA  0.0 104.7  0.0  0.0  0.0   0.0  0.0  0.0   2006
## 1814    0   NA  0.0  0.0   NA  0.0   0.0  0.0  0.0  0.0 103.0  0.0  0.0   2007
## 1878    0   NA  0.0 19.1 0.00  0.0   9.5  0.0  0.0  9.5   0.0  9.5  0.0   2007
## 1924    0   NA  0.0  0.0   NA  0.0  17.8  0.0  0.0  0.0   8.9  8.9  0.0   2007
## 2318    0   NA  0.0  0.0   NA  0.0   0.0  0.0  0.0  5.2   0.0 10.4  0.0   2009
## 3004    0   NA  0.0  0.0   NA  0.0  13.1  6.6  6.6  0.0  19.7  6.6  0.0   2013
## 3047    0   NA  0.0  0.0   NA  0.0   0.0  0.0  0.0  0.0   0.0  0.0  0.0   2014
## 3500    0   NA 17.4 17.4 1.00  0.0   0.0  0.0  0.0  0.0   8.7  8.7 17.4   2016
## 3677    0   NA  0.0  0.0   NA  0.0   0.0  0.0  0.0  0.0   0.0  0.0  0.0   2017
## 3683    0   NA  0.0  0.0   NA  0.0   0.0  0.0  0.0  0.0   0.0 26.1  0.0   2017
## 3733    0   NA  0.0  0.0   NA  0.0   0.0  0.0  0.0  0.0   0.0  0.0  0.0   2018
## 3745    0   NA  0.0  0.0   NA  0.0   0.0  0.0  0.0  0.0   0.0 25.2  0.0   2018
## 3862    0   NA  0.0  0.0   NA  0.0   0.0  0.0  0.0  0.0   0.0 16.8  0.0   2018
## 3958    0   NA  0.0  0.0   NA  0.0   0.0  0.0  0.0  0.0   0.0 49.5  0.0   2019
## 4071    0   NA  0.0  0.0   NA  0.0   0.0  0.0  0.0  0.0   0.0  0.0  0.0   2020
## 4183    0   NA  0.0  0.0   NA  0.0  25.5  0.0  0.0  0.0   0.0  0.0  0.0   2020
## 4186    0   NA  0.0  0.0   NA  0.0  20.0  0.0 10.0  0.0   0.0 10.0  0.0   2020
## 4191    0   NA  0.0  0.0   NA  0.0   0.0  0.0  0.0  0.0   0.0  0.0  0.0   2020
## 4237    0   NA  0.0  0.0   NA  0.0   0.0 18.0  0.0  0.0   0.0 18.0  0.0   2021
## 4246    0   NA  0.0  0.0   NA  0.0  10.2  0.0  0.0  0.0   0.0 20.3  0.0   2021
## 4248    0   NA  0.0  0.0   NA  0.0   0.0  0.0  0.0  0.0   0.0  0.0  0.0   2021
## 4355    0   NA  0.0  0.0   NA  0.0   0.0  0.0  0.0  0.0  33.1 16.5  0.0   2021
## 4360    0   NA  0.0  0.0   NA  0.0   0.0  9.9  0.0  0.0   9.9  9.9  0.0   2021
## 4364    0   NA  0.0  0.0   NA  0.0  13.5  0.0  0.0 13.5  13.5 13.5  0.0   2021
## 4370    0   NA  0.0  0.0   NA  0.0   0.0  0.0  0.0  0.0   0.0  0.0  0.0   2021
## 4405    0   NA  0.0  0.0   NA  0.0   0.0  0.0  0.0  0.0   0.0 25.2  0.0   2022
## 4477    0   NA  0.0  0.0   NA  0.0   0.0 25.5  0.0  0.0   0.0  0.0  0.0   2022
## 4554    0   NA  0.0  0.0   NA  0.0   0.0  0.0  0.0  0.0   0.0  0.0  0.0   2022
## 4569    0   NA  0.0  0.0   NA  0.0   0.0  0.0  0.0  0.0   0.0  0.0  0.0   2022
## 4639    0   NA  0.0  0.0   NA  0.0   0.0  0.0  0.0  0.0   0.0  0.0  0.0   2023
## 4872    0   NA  0.0  0.0   NA  0.0  17.2  0.0  0.0  0.0  17.2  0.0  0.0   2024

As we can see, that column is empty when the player did not attempt any field goals that year. Similarly for the 2, 3 points ones and the free throws. We’ll input 0 for these cases.

data$FG.[is.na(data$FG.) & data$FGA == 0] <- 0
data$X3P.[is.na(data$X3P.) & data$X3PA == 0] <- 0
data$X2P.[is.na(data$X2P.) & data$X2PA == 0] <- 0
data$FT.[is.na(data$FT.) & data$FTA == 0] <- 0

missing_values_by_column <- colSums(is.na(data))
print("Final Missing Values by Column:")
## [1] "Final Missing Values by Column:"
print(missing_values_by_column)
## Player   Team    Pos      G     MP     GS     FG    FGA    FG.    X3P   X3PA 
##      0      0      0      0      0      0      0      0      0      0      0 
##   X3P.    X2P   X2PA   X2P.     FT    FTA    FT.    ORB    TRB    AST    STL 
##      0      0      0      0      0      0      0      0      0      0      0 
##    BLK    TOV     PF    PTS Season 
##      0      0      0      0      0
data_shape <- dim(data)
print("Final Shape:")
## [1] "Final Shape:"
print(paste("Rows:", data_shape[1]))
## [1] "Rows: 4927"
print(paste("Columns:", data_shape[2]))
## [1] "Columns: 27"
duplicates <- data[duplicated(data),]
print("Number of Duplicates:")
## [1] "Number of Duplicates:"
print(dim(duplicates)[1])
## [1] 0

Variable Distribution

plot_distribution <- function(data, column, bins) {
  p1 <- ggplot(data, aes_string(x = column)) + 
    geom_histogram(bins = bins, fill = "#E15101", alpha = 0.7) + 
    labs(title = paste("Distribution of", column)) +
    theme(plot.margin = unit(c(1, 1, 1, 1), "cm"))
  
  p2 <- ggplot(data, aes_string(y = column)) + 
    geom_boxplot(fill = "#E15101", alpha = 0.7) + coord_flip() +
    theme(plot.margin = unit(c(0, 1, 0, 2.1), "cm"), 
          axis.ticks.y = element_blank(), axis.text.y = element_blank())
  
  grid.arrange(p1, p2, ncol = 1, heights = c(3, 1))
}

for (col in colnames(data)) {
  if (is.numeric(data[[col]])) {
    bins <- ifelse(grepl("\\.", col), 10, 27)
    plot_distribution(data, col, bins)
  }
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Modelling

Feature Engineering & Selection

correlation_matrix <- cor(data[, sapply(data, is.numeric)], use = "complete.obs")
print("Correlation Matrix:")
## [1] "Correlation Matrix:"
corrplot(correlation_matrix, method = "color", tl.col = "black", tl.srt = 45,  tl.cex = 0.7)

As expected, the field goals and each type of shot attempted and made is correlated with its made percentage. And the number of games played is correlated with the games started and minutes played, and the total and offensive rebounds as well. It’s interesting to note there is a slight negative correlation between rebounds and 3-point shooting, a good clue that we can find some pointers of position by the strong starts per player.

high_correlations <- which(abs(correlation_matrix) > 0.8 & abs(correlation_matrix) < 1, arr.ind = TRUE)

if (length(high_correlations) > 0) {
  for (i in 1:nrow(high_correlations)) {
    row <- high_correlations[i, 1]
    col <- high_correlations[i, 2]
    cat(sprintf("Correlation between %s and %s: %.2f\n", 
                rownames(correlation_matrix)[row], 
                colnames(correlation_matrix)[col], 
                correlation_matrix[row, col]))
  }
} else {
  print("No correlations greater than 80% found.")
}
## Correlation between MP and G: 0.80
## Correlation between G and MP: 0.80
## Correlation between GS and MP: 0.89
## Correlation between MP and GS: 0.89
## Correlation between X2P and FG: 0.89
## Correlation between PTS and FG: 0.93
## Correlation between X2P. and FG.: 0.88
## Correlation between X3PA and X3P: 0.85
## Correlation between X3P and X3PA: 0.85
## Correlation between FG and X2P: 0.89
## Correlation between X2PA and X2P: 0.85
## Correlation between X2P and X2PA: 0.85
## Correlation between FG. and X2P.: 0.88
## Correlation between FTA and FT: 0.91
## Correlation between FT and FTA: 0.91
## Correlation between FG and PTS: 0.93

For this project it’s important to differentiate between the different types of shooting, 2-point, 3-point and free throws, and we will keep the offensive and obtain the defensive rebounds, as well as the other defensive stats; and the total points. It’s important to remember these stats are already somewhat normalized, since they are the stats per 100 possessions.

data <- data %>%
  mutate(DRB = TRB - ORB)

columns_to_keep <- c("Player", "Team", "Season", "Pos", "X3PA", "X3P.","X2PA", "X2P.", "FTA", "FT.", 'PTS', "ORB", "DRB", "AST", "STL", "BLK", "TOV", "PF")
data <- data %>% select(all_of(columns_to_keep))

columns <- colnames(data)
print("Columns:")
## [1] "Columns:"
print(columns)
##  [1] "Player" "Team"   "Season" "Pos"    "X3PA"   "X3P."   "X2PA"   "X2P."  
##  [9] "FTA"    "FT."    "PTS"    "ORB"    "DRB"    "AST"    "STL"    "BLK"   
## [17] "TOV"    "PF"
# save data to a csv
# write.csv(data, file = file.path(data_directory, "wnba_cleaned.csv"), row.names = FALSE)

PCA Implementation

remove_outliers <- function(data) {
  numeric_columns <- sapply(data, is.numeric)
  for (col in names(data)[numeric_columns]) {
    Q1 <- quantile(data[[col]], 0.25, na.rm = TRUE)
    Q3 <- quantile(data[[col]], 0.75, na.rm = TRUE)
    IQR <- Q3 - Q1
    lower_bound <- Q1 - 1.5 * IQR
    upper_bound <- Q3 + 1.5 * IQR
    data <- data[data[[col]] >= lower_bound & data[[col]] <= upper_bound, ]
  }
  return(data)
}

data_no_outliers <- remove_outliers(data)
print("Remaining data points:")
## [1] "Remaining data points:"
dim(data_no_outliers)
## [1] 3457   18
# Perform PCA
numeric_cols <- sapply(data_no_outliers, is.numeric)
pca_result <- prcomp(data_no_outliers[, numeric_cols], scale. = TRUE)

summary(pca_result)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6     PC7
## Standard deviation     2.0166 1.5088 1.2348 0.97423 0.96258 0.91316 0.86266
## Proportion of Variance 0.2905 0.1626 0.1089 0.06779 0.06618 0.05956 0.05316
## Cumulative Proportion  0.2905 0.4531 0.5620 0.62978 0.69597 0.75553 0.80868
##                            PC8     PC9    PC10    PC11   PC12    PC13    PC14
## Standard deviation     0.80937 0.70276 0.67697 0.64214 0.5975 0.52550 0.16044
## Proportion of Variance 0.04679 0.03528 0.03273 0.02945 0.0255 0.01973 0.00184
## Cumulative Proportion  0.85547 0.89075 0.92349 0.95294 0.9784 0.99816 1.00000
var_explained <- pca_result$sdev^2 / sum(pca_result$sdev^2)
cumvar_explained <- cumsum(var_explained)

pca_var <- data.frame(
  PC = 1:length(var_explained),
  var_explained = var_explained,
  cum_var = cumvar_explained
)

p1 <- ggplot(pca_var, aes(x = PC, y = var_explained)) +
  geom_col(fill = "#E15101", alpha = 0.7) +
  geom_line(aes(y = cum_var), color = "#2e86c1", size = 1.5) +
  geom_point(aes(y = cum_var), color = "#2e86c1", size = 3) +
  geom_text(aes(y = cum_var, label = sprintf("%.2f", cum_var)), 
            vjust = -0.8, 
            hjust = 0.5, 
            size = 7) + 
  scale_y_continuous(
    name = "Proportion of Variance Explained",
    sec.axis = sec_axis(~., name = "Cumulative Proportion")
  ) +
  labs(x = "Principal Component") +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),   # Axis numbers
    axis.title = element_text(size = 16),  # Axis titles
    plot.title = element_text(size = 22),  # Plot title
    aspect.ratio = 9/16                    # Force 16:9 aspect ratio
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
print(p1)

#ggsave("figures/pca_scree.pdf", p1, width = 16, height = 9, units = "in")
# Plot PCA results
data_pca <- as.data.frame(pca_result$x)
data_pca$Player <- data_no_outliers$Player
data_pca$Pos <- data_no_outliers$Pos

ggplot(data_pca, 
       aes(x = PC1, y = PC2, color = Pos, label = Player)) +
  geom_point(alpha = 0.7) +
  labs(title = "PCA of WNBA Player Stats", x = "Principal Component 1", y = "Principal Component 2") +
  theme_minimal()

The two principal components explain 55% of the variance, which is not ideal, but it’s a good start. And we can see some clear zones for some of the positions with the first two, Guards are to the left, Forwards and Centers scattered but mostly right.

# 3D scatter plot
plot_ly(data_pca, x = ~PC1, y = ~PC2, z = ~PC3, color = ~Pos, 
        type = "scatter3d", mode = "markers", marker = list(size = 3)) %>%
  layout(title = "3D PCA of WNBA Player Stats",
         scene = list(xaxis = list(title = 'PC1'),
                      yaxis = list(title = 'PC2'),
                      zaxis = list(title = 'PC3')))

t-SNE

data_unique <- data_pca
data_unique$Player <- data_pca$Player
data_unique$Pos <- data_pca$Pos
data_unique <- data_unique[!duplicated(data_unique),]

# Perform t-SNE
set.seed(42)
tsne_result <- Rtsne(
  data_unique, dims = 3, perplexity = 30, verbose = TRUE, max_iter = 500
  )
## Performing PCA
## Read the 3456 x 50 data matrix successfully!
## OpenMP is working. 1 threads.
## Using no_dims = 3, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 1.52 seconds (sparsity = 0.037943)!
## Learning embedding...
## Iteration 50: error is 84.151462 (50 iterations in 2.75 seconds)
## Iteration 100: error is 80.952215 (50 iterations in 2.81 seconds)
## Iteration 150: error is 80.910261 (50 iterations in 1.25 seconds)
## Iteration 200: error is 80.910362 (50 iterations in 1.26 seconds)
## Iteration 250: error is 80.910384 (50 iterations in 1.12 seconds)
## Iteration 300: error is 2.659008 (50 iterations in 2.48 seconds)
## Iteration 350: error is 2.245123 (50 iterations in 1.57 seconds)
## Iteration 400: error is 2.076515 (50 iterations in 1.78 seconds)
## Iteration 450: error is 1.981314 (50 iterations in 1.87 seconds)
## Iteration 500: error is 1.921614 (50 iterations in 1.86 seconds)
## Fitting performed in 18.76 seconds.
data_tsne <- as.data.frame(tsne_result$Y)
data_tsne$Player <- data_unique$Player
data_tsne$Pos<- data_unique$Pos

print("t-SNE Data Size:")
## [1] "t-SNE Data Size:"
print(dim(data_tsne))
## [1] 3456    5
# Create a 2D scatter plot
ggplot(data_tsne, aes(x = V1, y = V2, color = Pos, label = Player)) +
  geom_point(alpha = 0.7) +
  labs(title = "t-SNE of WNBA Player Stats",
       x = "Dimension 1", y = "Dimension 2") +
  theme_minimal()

# Create a 3D scatter plot
plot_ly(data_tsne, x = ~V1, y = ~V2, z = ~V3, color = ~Pos, 
        type = "scatter3d", mode = "markers", marker = list(size = 2)) %>%
  layout(#title = "t-SNE of WNBA Player Stats",
         scene = list(xaxis = list(title = 'Dim 1'),
                      yaxis = list(title = 'Dim 2'),
                      zaxis = list(title = 'Dim 3')))

As we can see, chaining these two methods of dimensionality reduction we can see a clearer separation of the positions, The guards occupy a clear region, and the Forwards another, with the centers at the end of the cluster mixed in.

Clustering

We will attempt to retrieve the positions from the stats, to see if there is such a thing as a style of play corresponding to the labeled positions. We will do so using the original data, and the one reduced and transformed through t-SNE.

K-Means

calculate_wss <- function(data, k) {
  kmeans_result <- kmeans(data, centers = k, nstart = 25)
  return(kmeans_result$tot.withinss)
}

cross_validate_kmeans <- function(data, k_values, folds = 5) {
  set.seed(42)
  fold_indices <- createFolds(data[, 1], k = folds, list = TRUE, returnTrain = TRUE)
  wss_values <- sapply(k_values, function(k) {
    fold_wss <- sapply(fold_indices, function(indices) {
      train_data <- data[indices, ]
      calculate_wss(train_data, k)
    })
    mean(fold_wss)
  })
  return(wss_values)
}

numeric_cols <- sapply(data, is.numeric)
data1 <- data[, numeric_cols]
data2 <- data_tsne[, 1:3]

k_values <- 1:10

wss_values1 <- cross_validate_kmeans(data1, k_values)
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
wss_values2 <- cross_validate_kmeans(data2, k_values)

wss_df1 <- data.frame(k = k_values, wss = wss_values1)
wss_df2 <- data.frame(k = k_values, wss = wss_values2)
plot1 <- ggplot(wss_df1, aes(x = k, y = wss)) +
  geom_line(color = "#2e86c1") +
  geom_point(color = "#2e86c1") +
  labs(title = "Original Data", x = "Number of Clusters (k)", y = "Within-Cluster Sum of Squares (WSS)") +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 16),   
    axis.title = element_text(size = 18),  
    plot.title = element_text(size = 24),  
    aspect.ratio = 9/16                    
  )

plot2 <- ggplot(wss_df2, aes(x = k, y = wss)) +
  geom_line(color = "#2e86c1") +
  geom_point(color = "#2e86c1") +
  labs(title = "Data t-SNE", x = "Number of Clusters (k)", y = "Within-Cluster Sum of Squares (WSS)") +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 16),   
    axis.title = element_text(size = 18),  
    plot.title = element_text(size = 24),  
    aspect.ratio = 9/16                    
  )

combined_plots <- gridExtra::arrangeGrob(plot1, plot2, ncol = 2)

grid.arrange(combined_plots)

grid.arrange(plot1, plot2, ncol = 2)

#ggsave("figures/kmeans_elbow.pdf", combined_plots, width = 16, height = 9, units = "in")

There isn’t a clear winner. But we know our data set has 5 labels, so we will use that as the number of clusters.

# plot k_means_tsn result in 3d
kmeans_result <- kmeans(data1, centers = 5, nstart = 25)
kmeans_result_tsne <- kmeans(data2, centers = 5, nstart = 25)

plot_ly(data_tsne, x = ~V1, y = ~V2, z = ~V3, color = ~as.factor(kmeans_result_tsne$cluster), 
        type = "scatter3d", mode = "markers", marker = list(size = 3)) %>%
  layout(#title = "3D K-Means Clustering of WNBA Player Stats",
         scene = list(xaxis = list(title = 'Dim 1'),
                      yaxis = list(title = 'Dim 2'),
                      zaxis = list(title = 'Dim 3')))

DBSCAN

cross_validate_dbscan <- function(data, eps_values, minPts_values, folds = 5) {
  set.seed(42)
  fold_indices <- createFolds(data[, 1], k = folds, list = TRUE, returnTrain = TRUE)
  
  results <- expand.grid(eps = eps_values, minPts = minPts_values)
  results$silhouette <- NA
  
  for(i in 1:nrow(results)) {
    eps <- results$eps[i]
    minPts <- results$minPts[i]
    
    fold_silhouette <- sapply(fold_indices, function(indices) {
      train_data <- data[indices, ]
      dbscan_result <- dbscan(train_data, eps = eps, minPts = minPts)
      
      if(length(unique(dbscan_result$cluster)) > 1 && 
         sum(dbscan_result$cluster == 0) < (nrow(train_data) - 1)) {
        sil <- silhouette(dbscan_result$cluster, dist(train_data))
        mean(sil[, 3])
      } else {
        return(NA)
      }
    })
    
    results$silhouette[i] <- mean(fold_silhouette, na.rm = TRUE)
  }
  
  return(results)
}

eps_values <- seq(0.1, 1, by = 0.1)
minPts_values <- 3:8

results1 <- cross_validate_dbscan(data1, eps_values, minPts_values)
results2 <- cross_validate_dbscan(data2, eps_values, minPts_values)
# Plot the results
plot1 <- ggplot(results1, aes(x = eps, y = minPts, fill = silhouette)) +
  geom_tile() +
  scale_fill_gradient(low = "white", high = "#2e86c1", na.value = "grey90") +
  labs(title = "Original Data",
       x = "Epsilon", y = "MinPoints") +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 16),   
    axis.title = element_text(size = 18),  
    plot.title = element_text(size = 24),  
    #aspect.ratio = 9/16                    
  )

plot2 <- ggplot(results2, aes(x = eps, y = minPts, fill = silhouette)) +
  geom_tile() +
  scale_fill_gradient(low = "white", high = "#2e86c1", na.value = "grey90") +
  labs(title = "t-SNE Data",
       x = "Epsilon", y = "MinPoints") +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 16),   
    axis.title = element_text(size = 18),  
    plot.title = element_text(size = 24),  
    #aspect.ratio = 9/16                    
  )

combined_plots <- gridExtra::arrangeGrob(plot1, plot2, ncol = 2)
grid.arrange(combined_plots)

#ggsave("figures/dbscan.pdf", combined_plots, width = 16, height = 9, units = "in")

# Get best parameters
best_params1 <- results1[which.max(results1$silhouette), ]
best_params2 <- results2[which.max(results2$silhouette), ]

cat("Best parameters for original data:\n",
    "eps =", best_params1$eps, ", minPts =", best_params1$minPts,
    "\nSilhouette score:", best_params1$silhouette, "\n\n")
## Best parameters for original data:
##  eps = 0.1 , minPts = 7 
## Silhouette score: 0.3890026
cat("Best parameters for t-SNE data:\n",
    "eps =", best_params2$eps, ", minPts =", best_params2$minPts,
    "\nSilhouette score:", best_params2$silhouette, "\n")
## Best parameters for t-SNE data:
##  eps = 1 , minPts = 3 
## Silhouette score: 0.1128639
dbscan1 <- dbscan(data1, 
                  eps = best_params1$eps, 
                  minPts = best_params1$minPts)

dbscan2 <- dbscan(data2, 
                  eps = best_params2$eps, 
                  minPts = best_params2$minPts)

plot1 <- ggplot(data1, 
                aes(x = X3PA, y = AST)) +
  geom_point(aes(color = factor(dbscan1$cluster)), alpha = 0.7) +
  labs(title = "DBSCAN Clusters (Original Data)", 
       subtitle = paste("eps =", round(best_params1$eps, 2), 
                       ", minPts =", best_params1$minPts,
                       "\nSilhouette =", round(best_params1$silhouette, 3)),
       color = "Cluster") +
  theme_minimal()

plot2 <- plot_ly(data.frame(data2),
                 x = ~V1, y = ~V2, z = ~V3, 
                 color = factor(dbscan2$cluster),
                 type = "scatter3d", 
                 mode = "markers",
                 marker = list(size = 3)) %>%
  layout(title = paste("DBSCAN Clusters (t-SNE Data)\n",
                      "eps =", round(best_params2$eps, 2),
                      ", minPts =", best_params2$minPts,
                      ", Silhouette =", round(best_params2$silhouette, 3)))

print(plot1)

print(plot2)
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
cat("\nOriginal Data - Number of clusters:", length(unique(dbscan1$cluster)) - 1, 
    "\nNoise points:", sum(dbscan1$cluster == 0), "\n")
## 
## Original Data - Number of clusters: 1 
## Noise points: 4915
cat("\nt-SNE Data - Number of clusters:", length(unique(dbscan2$cluster)) - 1, 
    "\nNoise points:", sum(dbscan2$cluster == 0), "\n")
## 
## t-SNE Data - Number of clusters: 472 
## Noise points: 825

DBScan is not picking up any differences between the data, putting everything in the same cluster even when we did hyperparameter tuning. We saw initially how there’s no significant spatial separation between the different player positions in our point cloud which are the cases this algorithm thrives in.

Hierarchical Clustering

cross_validate_hclust <- function(
    data, k_values, methods = c("complete", "average", "single", "ward.D2"), folds = 5) {
  set.seed(42)
  fold_indices <- createFolds(1:nrow(data), k = folds, list = TRUE, returnTrain = TRUE)
  
  results <- expand.grid(k = k_values, method = methods)
  results$silhouette <- NA
  
  for(i in 1:nrow(results)) {
    k <- results$k[i]
    method <- as.character(results$method[i])
    
    fold_silhouette <- sapply(fold_indices, function(indices) {
      train_data <- data[indices, ]
      
      dist_matrix <- dist(train_data, method = "euclidean")
      hc <- hclust(dist_matrix, method = method)
      clusters <- cutree(hc, k = k)
      
      if(length(unique(clusters)) > 1) {
        sil <- silhouette(clusters, dist_matrix)
        mean(sil[, 3])
      } else {
        return(NA)
      }
    })
    
    results$silhouette[i] <- mean(fold_silhouette, na.rm = TRUE)
  }
  
  return(results)
}

data1 <- scale(data[, numeric_cols])
data2 <- scale(data_tsne[, 1:3])

k_values <- 2:10
methods <- c("complete", "average", "single", "ward.D2")

results1 <- cross_validate_hclust(data1, k_values, methods)
results2 <- cross_validate_hclust(data2, k_values, methods)
plot1 <- ggplot(results1, aes(x = k, y = silhouette, color = method)) +
  geom_line() +
  geom_point() +
  labs(title = "Original Data", 
       x = "Number of Clusters (k)", 
       y = "Silhouette Score") +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 18),   
    axis.title = element_text(size = 20),  
    plot.title = element_text(size = 26),  
    #aspect.ratio = 9/16                    
  )

plot2 <- ggplot(results2, aes(x = k, y = silhouette, color = method)) +
  geom_line() +
  geom_point() +
  labs(title = "t-SNE Data", 
       x = "Number of Clusters (k)", 
       y = "Silhouette Score") +
  theme_minimal() +
  theme(
      axis.text = element_text(size = 18),   
      axis.title = element_text(size = 20),  
      plot.title = element_text(size = 26),  
      #aspect.ratio = 9/16                    
    )

combined_plots <- gridExtra::arrangeGrob(plot1, plot2, ncol = 2)
grid.arrange(combined_plots)

#ggsave("figures/hierarchical_cv.pdf", combined_plots, width = 16, height = 9, units = "in")

# Get best parameters
best_params1 <- results1[which.max(results1$silhouette), ]
best_params2 <- results2[which.max(results2$silhouette), ]

cat("Best parameters for original data:\n",
    "k =", best_params1$k, ", method =", best_params1$method,
    "\nSilhouette score:", best_params1$silhouette, "\n\n")
## Best parameters for original data:
##  k = 2 , method = 3 
## Silhouette score: 0.8724845
cat("Best parameters for t-SNE data:\n",
    "k =", best_params2$k, ", method =", best_params2$method,
    "\nSilhouette score:", best_params2$silhouette, "\n")
## Best parameters for t-SNE data:
##  k = 4 , method = 2 
## Silhouette score: 0.3005942
dist_matrix1 <- dist(data1, method = "euclidean")
hc1 <- hclust(dist_matrix1, method = best_params1$method)
clusters1 <- cutree(hc1, k = best_params1$k)

dist_matrix2 <- dist(data2, method = "euclidean")
hc2 <- hclust(dist_matrix2, method = best_params2$method)
clusters2 <- cutree(hc2, k = best_params2$k)

plot1 <- ggplot(data, 
                aes(x = X3PA, y = AST)) +
  geom_point(aes(color = factor(clusters1)), alpha = 0.7) +
  labs(title = "Hierarchical Clusters (Original Data)", 
       subtitle = paste("k =", best_params1$k, 
                       ", method =", best_params1$method,
                       "\nSilhouette =", round(best_params1$silhouette, 3)),
       color = "Cluster") +
  theme_minimal()

plot2 <- plot_ly(data.frame(data_tsne), 
                 x = ~V1, y = ~V2, z = ~V3, 
                 color = factor(clusters2),
                 type = "scatter3d", 
                 mode = "markers",
                 marker = list(size = 3)) %>%
  layout(title = paste("Hierarchical Clusters (t-SNE Data)\n",
                      "k =", best_params2$k,
                      ", method =", best_params2$method,
                      ", Silhouette =", round(best_params2$silhouette, 3)))

print(plot1)

print(plot2)

cat("\nOriginal Data - Number of clusters:", length(unique(clusters1)), 
    "\nCluster sizes:", table(clusters1), "\n")
## 
## Original Data - Number of clusters: 2 
## Cluster sizes: 4926 1
cat("\nt-SNE Data - Number of clusters:", length(unique(clusters2)), 
    "\nCluster sizes:", table(clusters2), "\n")
## 
## t-SNE Data - Number of clusters: 4 
## Cluster sizes: 946 728 1059 723
par(mfrow = c(1, 2))
plot(hc1, 
     main = paste("Dendrogram (Original Data)\nMethod:", best_params1$method),
     xlab = "", 
     sub = "", 
     cex = 0.6,
     hang = -1) 
rect.hclust(hc1, k = best_params1$k, border = "red")

plot(hc2, 
     main = paste("Dendrogram (t-SNE Data)\nMethod:", best_params2$method),
     xlab = "", 
     sub = "", 
     cex = 0.6,
     hang = -1)
rect.hclust(hc2, k = best_params2$k, border = "red")

par(mfrow = c(1, 1))
par(mar = c(8, 4, 4, 2))

plot(hc2, 
     #main = paste("t-SNE Data)\nMethod:", best_params2$method),
     xlab = "", 
     sub = "", 
     cex = 0.6,
     hang = -1,
     axes = FALSE,
     labels = FALSE
     )
rect.hclust(hc2, k = best_params2$k, border = "#2e86c1")

labels <- paste0(data_tsne$Player, "\n", data_tsne$Pos)[hc2$order]
n_labels <- 16
indices <- round(seq(1, length(hc2$order), length.out = n_labels))
axis(1, at = indices, 
     labels = labels[indices],
     las = 2,      
     srt = 70,     
     cex.axis = 0.8)
# Add y-axis
axis(2)

# pdf("figures/dendrogram.pdf", width = 16, height = 9)
# par(mar = c(12, 4, 4, 2))  # Increase bottom margin for labels
# 
# plot(hc2,
#      main = "",
#      xlab = "", 
#      sub = "", 
#      cex = 0.8,           # Base text size
#      cex.main = 2,        # Title size
#      cex.axis = 1.2,      # Axis text size
#      cex.lab = 1.4,       # Axis labels size
#      hang = -1,
#      axes = FALSE,
#      labels = FALSE
# )
# rect.hclust(hc2, k = best_params2$k, border = "#2e86c1")
# 
# # Add labels with increased size
# labels <- paste0(data_tsne$Player, "\n", data_tsne$Pos)[hc2$order]
# n_labels <- 16
# indices <- round(seq(1, length(hc2$order), length.out = n_labels))
# axis(1, 
#      at = indices, 
#      labels = labels[indices],
#      las = 2,      
#      cex.axis = 1.2,     # Increase label size
#      padj = 0.5)         # Adjust label position
# # Add y-axis
# axis(2, cex.axis = 1.2)  # Increase y-axis label size
# 
# dev.off()

Gaussian Mixture Models (GMMs)

cross_validate_gmm <- function(data, G_values, folds = 5) {
  set.seed(42)
  fold_indices <- createFolds(1:nrow(data), k = folds, list = TRUE, returnTrain = TRUE)
  
  results <- data.frame(G = G_values, bic = NA)
  
  for(i in seq_along(G_values)) {
    G <- G_values[i]
    
    # Calculate BIC scores across folds
    fold_bic <- sapply(fold_indices, function(indices) {
      train_data <- data[indices, ]
      gmm <- Mclust(train_data, G = G)
      return(gmm$bic)
    })
    
    results$bic[i] <- mean(fold_bic)
  }
  
  return(results)
}

data1 <- scale(data[, numeric_cols])
data2 <- scale(data_tsne[, 1:3])

G_values <- 1:10

results1 <- cross_validate_gmm(data1, G_values)
results2 <- cross_validate_gmm(data2, G_values)

plot1 <- ggplot(results1, aes(x = G, y = bic)) +
  geom_line() +
  geom_point() +
  labs(title = "GMM Model Selection\n(Original Data)", 
       x = "Number of Components", 
       y = "BIC Score") +
  theme_minimal()

plot2 <- ggplot(results2, aes(x = G, y = bic)) +
  geom_line() +
  geom_point() +
  labs(title = "GMM Model Selection\n(t-SNE Data)", 
       x = "Number of Components", 
       y = "BIC Score") +
  theme_minimal()

grid.arrange(plot1, plot2, ncol = 2)

best_G1 <- G_values[which.min(results1$bic)]
best_G2 <- G_values[which.min(results2$bic)]
# Plot results with improved styling
# plot1 <- ggplot(results1, aes(x = G, y = bic)) +
#   geom_line(color = "#2e86c1", size = 1.5) +
#   geom_point(color = "#2e86c1", size = 3) +
#   labs(title = "Original Data", 
#        x = "Number of Components", 
#        y = "BIC Score") +
#   theme_minimal() +
#   scale_y_continuous(labels = function(x) format(x/1000, scientific = FALSE, big.mark = ",", suffix = "k")) +
#   theme(
#     axis.text = element_text(size = 16),   # Axis numbers
#     axis.title = element_text(size = 18),  # Axis titles
#     plot.title = element_text(size = 24),  # Plot title
#     aspect.ratio = 9/16                    # Force 16:9 aspect ratio
#   )
# 
# plot2 <- ggplot(results2, aes(x = G, y = bic)) +
#   geom_line(color = "#2e86c1", size = 1.5) +
#   geom_point(color = "#2e86c1", size = 3) +
#   labs(title = "t-SNE Data", 
#        x = "Number of Components", 
#        y = "BIC Score") +
#   theme_minimal() +
#   scale_y_continuous(labels = function(x) format(x/1000, scientific = FALSE, big.mark = ",", suffix = "k")) +
#   theme(
#     axis.text = element_text(size = 16),   
#     axis.title = element_text(size = 18),  
#     plot.title = element_text(size = 24),  
#     aspect.ratio = 9/16                    
#   )
# 
# # Combine plots and save
# combined_plots <- gridExtra::arrangeGrob(plot1, plot2, ncol = 2)
# grid.arrange(combined_plots)
# ggsave("figures/gmm_cv.pdf", combined_plots, width = 16, height = 6, units = "in")
gmm1 <- Mclust(data1, G = best_G1)
gmm2 <- Mclust(data2, G = best_G2)

plot1 <- ggplot(data, 
                aes(x = X3PA, y = X2P.)) +
  geom_point(aes(color = factor(gmm1$classification)), alpha = 0.7) +
  labs(title = "GMM Clusters (Original Data)", 
       subtitle = paste("Components =", best_G1,
                       "\nBIC =", round(min(results1$bic), 2)),
       color = "Cluster") +
  theme_minimal()

plot2 <- plot_ly(data.frame(data_tsne), 
                 x = ~V1, y = ~V2, z = ~V3, 
                 color = factor(gmm2$classification),
                 type = "scatter3d", 
                 mode = "markers",
                 marker = list(size = 3)) %>%
  layout(title = paste("GMM Clusters (t-SNE Data)\n",
                      "Components =", best_G2,
                      "\nBIC =", round(min(results2$bic), 2)))

print(plot1)

print(plot2)
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
cat("\nOriginal Data - Number of components:", best_G1, 
    "\nCluster sizes:", table(gmm1$classification), "\n")
## 
## Original Data - Number of components: 1 
## Cluster sizes: 4927
cat("\nt-SNE Data - Number of components:", best_G2, 
    "\nCluster sizes:", table(gmm2$classification), "\n")
## 
## t-SNE Data - Number of components: 1 
## Cluster sizes: 3456

Results

true_labels <- as.numeric(as.factor(data$Pos))
true_labels_tsne <- as.numeric(as.factor(data_tsne$Pos))

homogeneity_kmeans1 <- NMI(true_labels, kmeans_result$cluster)
homogeneity_dbscan1 <- NMI(true_labels, dbscan1$cluster)
homogeneity_hclust1 <- NMI(true_labels, clusters1)
homogeneity_gmm1 <- NMI(true_labels, gmm1$classification)

homogeneity_kmeans2 <- NMI(true_labels_tsne, kmeans_result_tsne$cluster)
homogeneity_dbscan2 <- NMI(true_labels_tsne, dbscan2$cluster)
homogeneity_hclust2 <- NMI(true_labels_tsne, clusters2)
homogeneity_gmm2 <- NMI(true_labels_tsne, gmm2$classification)

results_df <- data.frame(
  Method = c("K-means", "DBSCAN", "Hierarchical", "GMM"),
  Original_Data = c(homogeneity_kmeans1, homogeneity_dbscan1, 
                   homogeneity_hclust1, homogeneity_gmm1),
  TSNE_Data = c(homogeneity_kmeans2, homogeneity_dbscan2, 
                homogeneity_hclust2, homogeneity_gmm2)
)

print("Homogeneity Scores:")
## [1] "Homogeneity Scores:"
print(results_df)
##         Method Original_Data TSNE_Data
## 1      K-means  0.1138672811 0.3483866
## 2       DBSCAN  0.0002533170 0.1754827
## 3 Hierarchical  0.0001306353 0.2187598
## 4          GMM  0.0000000000 0.0000000
results_long <- tidyr::pivot_longer(results_df, 
                                  cols = c("Original_Data", "TSNE_Data"),
                                  names_to = "Dataset",
                                  values_to = "Score")

plot <- ggplot(results_long, aes(x = Method, y = Score, fill = Dataset)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(#title = "Clustering Methods Comparison",
       y = "Homogeneity Score",
       x = "Clustering Method") +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 16),
    axis.title = element_text(size = 18),
    plot.title = element_text(size = 24),
    aspect.ratio = 9/16
  )
  #theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(plot)

#ggsave("figures/homogeneity_scores.pdf", plot, width = 16, height = 9, units = "in")
data_tsne$cluster <- kmeans_result_tsne$cluster

cluster_means <- data_tsne %>%
  group_by(cluster) %>%
  summarize(across(c("V1", "V2", "V3"), mean)) %>%
  ungroup()

cluster_matrix <- as.matrix(cluster_means[, -1])
rownames(cluster_matrix) <- paste("Cluster", cluster_means$cluster)

scaled_matrix <- scale(cluster_matrix)

pheatmap(scaled_matrix,
         main = "Cluster Profiles - K-means on t-SNE Data",
         angle_col = 45,
         display_numbers = TRUE,
         number_format = "%.1f",
         fontsize_number = 7,
         cluster_rows = FALSE,  # Don't cluster the rows
         cluster_cols = TRUE)   # Allow clustering of features

data$cluster <- kmeans_result$cluster

cluster_means <- data %>%
  group_by(cluster) %>%
  summarize(across(head(names(numeric_cols)[numeric_cols], -1), mean)) %>%
  ungroup()

cluster_matrix <- as.matrix(cluster_means[, -1])
rownames(cluster_matrix) <- paste("Cluster", cluster_means$cluster)

scaled_matrix <- scale(cluster_matrix)

pheatmap(scaled_matrix,
         main = "",
         angle_col = 45,
         display_numbers = TRUE,
         number_format = "%.1f",
         fontsize_number = 8,
         cluster_rows = FALSE,  
         cluster_cols = TRUE,
         legend = FALSE,
         #filename = "figures/final_heatmap.pdf"
         )

# For t-SNE data k-means clusters
cluster_positions <- data_tsne %>%
  group_by(cluster) %>%
  count(Pos) %>%
  arrange(cluster, desc(n)) %>%
  group_by(cluster) %>%
  slice(1) %>%
  ungroup()

print("Majority positions in t-SNE k-means clusters:")
## [1] "Majority positions in t-SNE k-means clusters:"
print(cluster_positions)
## # A tibble: 5 × 3
##   cluster Pos       n
##     <int> <chr> <int>
## 1       1 G       623
## 2       2 F       430
## 3       3 F       358
## 4       4 G       760
## 5       5 F       292
# For original data k-means clusters
cluster_positions_orig <- data %>%
  group_by(cluster) %>%
  count(Pos) %>%
  arrange(cluster, desc(n)) %>%
  group_by(cluster) %>%
  slice(1) %>%
  ungroup()

print("\nMajority positions in original data k-means clusters:")
## [1] "\nMajority positions in original data k-means clusters:"
print(cluster_positions_orig)
## # A tibble: 5 × 3
##   cluster Pos       n
##     <int> <chr> <int>
## 1       1 G       600
## 2       2 F       262
## 3       3 F       589
## 4       4 G       310
## 5       5 G       784
# Optional: Add percentages
cluster_details <- data_tsne %>%
  group_by(cluster) %>%
  summarise(
    majority_pos = names(which.max(table(Pos))),
    majority_count = max(table(Pos)),
    total_count = n(),
    percentage = round(max(table(Pos)) / n() * 100, 1)
  )

print("\nDetailed cluster composition (t-SNE):")
## [1] "\nDetailed cluster composition (t-SNE):"
print(cluster_details)
## # A tibble: 5 × 5
##   cluster majority_pos majority_count total_count percentage
##     <int> <chr>                 <int>       <int>      <dbl>
## 1       1 G                       623         658       94.7
## 2       2 F                       430         646       66.6
## 3       3 F                       358         771       46.4
## 4       4 G                       760         797       95.4
## 5       5 F                       292         584       50